import warnings
warnings.filterwarnings('ignore')
import pandas as pd #library for data manipulation and analysis
import numpy as np #library used for working with arrays
import matplotlib.pyplot as plt #library for plots and visualizations
import seaborn as sns #library for visualizations
from scipy.stats import norm #using normal distributions
from scipy.stats import zscore
#Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split # Sklearn package's randomized data splitting function
#Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import metrics
#Random Forest
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
import scipy.stats as stats
from sklearn import metrics
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
#To install xgboost library use - !pip install xgboost
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
%matplotlib inline
pd.set_option('display.float_format', lambda x: '%.2f' % x) # To suppress numerical display in scientific notations
import scipy.stats #library contains a number of probability distributions and statistical functions
# Load the tourism marketing dataset and preview the first five rows
df = pd.read_csv('Tourism.csv')
df.head()
| CustomerID | ProdTaken | Age | TypeofContact | CityTier | DurationOfPitch | Occupation | Gender | NumberOfPersonVisiting | NumberOfFollowups | ProductPitched | PreferredPropertyStar | MaritalStatus | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | NumberOfChildrenVisiting | Designation | MonthlyIncome | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 200000 | 1 | 41.00 | Self Enquiry | 3 | 6.00 | Salaried | Female | 3 | 3.00 | Deluxe | 3.00 | Single | 1.00 | 1 | 2 | 1 | 0.00 | Manager | 20993.00 |
| 1 | 200001 | 0 | 49.00 | Company Invited | 1 | 14.00 | Salaried | Male | 3 | 4.00 | Deluxe | 4.00 | Divorced | 2.00 | 0 | 3 | 1 | 2.00 | Manager | 20130.00 |
| 2 | 200002 | 1 | 37.00 | Self Enquiry | 1 | 8.00 | Free Lancer | Male | 3 | 4.00 | Basic | 3.00 | Single | 7.00 | 1 | 3 | 0 | 0.00 | Executive | 17090.00 |
| 3 | 200003 | 0 | 33.00 | Company Invited | 1 | 9.00 | Salaried | Female | 2 | 3.00 | Basic | 3.00 | Divorced | 2.00 | 1 | 5 | 1 | 1.00 | Executive | 17909.00 |
| 4 | 200004 | 0 | NaN | Self Enquiry | 1 | 8.00 | Small Business | Male | 2 | 3.00 | Basic | 4.00 | Divorced | 1.00 | 0 | 5 | 1 | 0.00 | Executive | 18468.00 |
# Summary statistics for every column (numeric and categorical), transposed for readability
df.describe(include='all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| CustomerID | 4888.00 | NaN | NaN | NaN | 202443.50 | 1411.19 | 200000.00 | 201221.75 | 202443.50 | 203665.25 | 204887.00 |
| ProdTaken | 4888.00 | NaN | NaN | NaN | 0.19 | 0.39 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| Age | 4662.00 | NaN | NaN | NaN | 37.62 | 9.32 | 18.00 | 31.00 | 36.00 | 44.00 | 61.00 |
| TypeofContact | 4863 | 2 | Self Enquiry | 3444 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| CityTier | 4888.00 | NaN | NaN | NaN | 1.65 | 0.92 | 1.00 | 1.00 | 1.00 | 3.00 | 3.00 |
| DurationOfPitch | 4637.00 | NaN | NaN | NaN | 15.49 | 8.52 | 5.00 | 9.00 | 13.00 | 20.00 | 127.00 |
| Occupation | 4888 | 4 | Salaried | 2368 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Gender | 4888 | 3 | Male | 2916 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| NumberOfPersonVisiting | 4888.00 | NaN | NaN | NaN | 2.91 | 0.72 | 1.00 | 2.00 | 3.00 | 3.00 | 5.00 |
| NumberOfFollowups | 4843.00 | NaN | NaN | NaN | 3.71 | 1.00 | 1.00 | 3.00 | 4.00 | 4.00 | 6.00 |
| ProductPitched | 4888 | 5 | Basic | 1842 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| PreferredPropertyStar | 4862.00 | NaN | NaN | NaN | 3.58 | 0.80 | 3.00 | 3.00 | 3.00 | 4.00 | 5.00 |
| MaritalStatus | 4888 | 4 | Married | 2340 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| NumberOfTrips | 4748.00 | NaN | NaN | NaN | 3.24 | 1.85 | 1.00 | 2.00 | 3.00 | 4.00 | 22.00 |
| Passport | 4888.00 | NaN | NaN | NaN | 0.29 | 0.45 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 |
| PitchSatisfactionScore | 4888.00 | NaN | NaN | NaN | 3.08 | 1.37 | 1.00 | 2.00 | 3.00 | 4.00 | 5.00 |
| OwnCar | 4888.00 | NaN | NaN | NaN | 0.62 | 0.49 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 |
| NumberOfChildrenVisiting | 4822.00 | NaN | NaN | NaN | 1.19 | 0.86 | 0.00 | 1.00 | 1.00 | 2.00 | 3.00 |
| Designation | 4888 | 5 | Executive | 1842 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| MonthlyIncome | 4655.00 | NaN | NaN | NaN | 23619.85 | 5380.70 | 1000.00 | 20346.00 | 22347.00 | 25571.00 | 98678.00 |
# Column dtypes and non-null counts for the raw data
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4888 entries, 0 to 4887 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 4888 non-null int64 1 ProdTaken 4888 non-null int64 2 Age 4662 non-null float64 3 TypeofContact 4863 non-null object 4 CityTier 4888 non-null int64 5 DurationOfPitch 4637 non-null float64 6 Occupation 4888 non-null object 7 Gender 4888 non-null object 8 NumberOfPersonVisiting 4888 non-null int64 9 NumberOfFollowups 4843 non-null float64 10 ProductPitched 4888 non-null object 11 PreferredPropertyStar 4862 non-null float64 12 MaritalStatus 4888 non-null object 13 NumberOfTrips 4748 non-null float64 14 Passport 4888 non-null int64 15 PitchSatisfactionScore 4888 non-null int64 16 OwnCar 4888 non-null int64 17 NumberOfChildrenVisiting 4822 non-null float64 18 Designation 4888 non-null object 19 MonthlyIncome 4655 non-null float64 dtypes: float64(7), int64(7), object(6) memory usage: 763.9+ KB
# Count and percentage of missing values, restricted to columns that have any.
# Compute the per-column null counts once instead of four separate times.
null_counts = df.isnull().sum()
null_counts = null_counts[null_counts > 0]
pd.DataFrame({'Count': null_counts, 'Percentage': (null_counts / df.shape[0]) * 100})
| Count | Percentage | |
|---|---|---|
| Age | 226 | 4.62 |
| TypeofContact | 25 | 0.51 |
| DurationOfPitch | 251 | 5.14 |
| NumberOfFollowups | 45 | 0.92 |
| PreferredPropertyStar | 26 | 0.53 |
| NumberOfTrips | 140 | 2.86 |
| NumberOfChildrenVisiting | 66 | 1.35 |
| MonthlyIncome | 233 | 4.77 |
# Frequency of each contact type, including the missing values
df['TypeofContact'].value_counts(dropna=False)
Self Enquiry 3444 Company Invited 1419 NaN 25 Name: TypeofContact, dtype: int64
# Drop every row containing any missing value.
# NOTE(review): this discards ~15% of rows in total — imputation may be worth considering.
df = df.dropna()
# Re-check: the table should now be empty (compute the null counts once, not four times)
null_counts = df.isnull().sum()
null_counts = null_counts[null_counts > 0]
pd.DataFrame({'Count': null_counts, 'Percentage': (null_counts / df.shape[0]) * 100})
| Count | Percentage |
|---|
################################# CHECK DUPLICATES
# Count rows that are identical across all columns
df.duplicated().sum()
0
# CustomerID is a unique row identifier with no predictive value — drop it
df = df.drop(columns=['CustomerID'])
# Defining the function for creating boxplot and histogram
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Draw a boxplot stacked above a histogram for a single numeric column.

    data: dataframe holding the column
    feature: name of the dataframe column to plot
    figsize: size of the figure (default (12, 7))
    kde: whether to overlay a density curve (default False)
    bins: number of histogram bins (default None, letting seaborn choose)
    """
    fig, (box_ax, hist_ax) = plt.subplots(
        nrows=2,                                    # two stacked panels
        sharex=True,                                # both panels share the x-axis scale
        gridspec_kw={"height_ratios": (0.25, 0.75)},  # smaller boxplot on top
        figsize=figsize,
    )
    # Boxplot; showmeans marks the column mean with a star
    sns.boxplot(data=data, x=feature, ax=box_ax, showmeans=True, color="mediumturquoise")
    # Histogram; only pass bins through when the caller supplied a value
    hist_kwargs = dict(data=data, x=feature, kde=kde, ax=hist_ax, color="mediumpurple")
    if bins:
        hist_kwargs["bins"] = bins
    sns.histplot(**hist_kwargs)
    # Vertical reference lines: dashed green = mean, solid black = median
    hist_ax.axvline(data[feature].mean(), color="green", linestyle="--")
    hist_ax.axvline(data[feature].median(), color="black", linestyle="-")
# Collect the non-object (numeric) columns, then draw the combined
# boxplot/histogram for each one using the helper defined above
columns_list = [col for col in df.columns if df.dtypes[col] != object]
for col in columns_list:
    histogram_boxplot(df, col)
############################ Treating outliers
# Univariate outlier detection via boxplots
# numeric columns known from the summary stats to contain extreme values
numeric_columns = ['NumberOfTrips', 'DurationOfPitch', 'MonthlyIncome']
plt.figure(figsize=(15, 12))
for idx, col in enumerate(numeric_columns, start=1):
    plt.subplot(4, 4, idx)  # 4x4 grid leaves room if more columns are added
    plt.boxplot(df[col], whis=1.5)
    plt.tight_layout()
    plt.title(col)
plt.show()
#IQR
# 25th and 75th percentiles for the selected numerical columns
Q1 = df[numeric_columns].quantile(0.25)
Q3 = df[numeric_columns].quantile(0.75)
IQR = Q3 - Q1 #Inter Quantile Range (75th percentile - 25th percentile)
# Tukey fences: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are treated as outliers
lower_whisker = Q1 - 1.5*IQR #Finding lower and upper bounds for all values. All values outside these bounds are outliers
upper_whisker = Q3 + 1.5*IQR
# Percentage of outliers in each column (share of all rows)
((df[numeric_columns] < lower_whisker) | (df[numeric_columns] > upper_whisker)).sum()/df.shape[0]*100
NumberOfTrips 2.59 DurationOfPitch 0.02 MonthlyIncome 5.40 dtype: float64
def treat_outliers(df, col):
    """
    Cap outliers in one numerical column using the 1.5*IQR (Tukey fence) rule.

    df: dataframe to treat (the column is modified in place; df is also returned)
    col: str, name of the numerical column

    Values below Q1 - 1.5*IQR are raised to that bound and values above
    Q3 + 1.5*IQR are lowered to that bound; everything in between is untouched.
    """
    Q1 = df[col].quantile(0.25)  # 25th percentile
    Q3 = df[col].quantile(0.75)  # 75th percentile
    IQR = Q3 - Q1  # Inter Quantile Range (75th percentile - 25th percentile)
    lower_whisker = Q1 - 1.5 * IQR
    upper_whisker = Q3 + 1.5 * IQR
    # Series.clip caps every value outside [lower_whisker, upper_whisker]
    # at the nearer bound (idiomatic pandas equivalent of np.clip here)
    df[col] = df[col].clip(lower=lower_whisker, upper=upper_whisker)
    return df
# Cap the outliers in the three columns flagged by the IQR check above
for col in ['NumberOfTrips', 'DurationOfPitch', 'MonthlyIncome']:
    df = treat_outliers(df, col)
# Visualize the treated columns again — whiskers should now be capped
# at the IQR fences with no points beyond them
numeric_columns = ['NumberOfTrips', 'DurationOfPitch', 'MonthlyIncome']
plt.figure(figsize=(15, 12))
for idx, col in enumerate(numeric_columns, start=1):
    plt.subplot(4, 4, idx)
    plt.boxplot(df[col], whis=1.5)
    plt.tight_layout()
    plt.title(col)
plt.show()
# Confirm no nulls remain and CustomerID is gone
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 4128 entries, 0 to 4887 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ProdTaken 4128 non-null int64 1 Age 4128 non-null float64 2 TypeofContact 4128 non-null object 3 CityTier 4128 non-null int64 4 DurationOfPitch 4128 non-null float64 5 Occupation 4128 non-null object 6 Gender 4128 non-null object 7 NumberOfPersonVisiting 4128 non-null int64 8 NumberOfFollowups 4128 non-null float64 9 ProductPitched 4128 non-null object 10 PreferredPropertyStar 4128 non-null float64 11 MaritalStatus 4128 non-null object 12 NumberOfTrips 4128 non-null float64 13 Passport 4128 non-null int64 14 PitchSatisfactionScore 4128 non-null int64 15 OwnCar 4128 non-null int64 16 NumberOfChildrenVisiting 4128 non-null float64 17 Designation 4128 non-null object 18 MonthlyIncome 4128 non-null float64 dtypes: float64(7), int64(6), object(6) memory usage: 645.0+ KB
# Preview the cleaned dataframe
df.head()
| ProdTaken | Age | TypeofContact | CityTier | DurationOfPitch | Occupation | Gender | NumberOfPersonVisiting | NumberOfFollowups | ProductPitched | PreferredPropertyStar | MaritalStatus | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | NumberOfChildrenVisiting | Designation | MonthlyIncome | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 41.00 | Self Enquiry | 3 | 6.00 | Salaried | Female | 3 | 3.00 | Deluxe | 3.00 | Single | 1.00 | 1 | 2 | 1 | 0.00 | Manager | 20993.00 |
| 1 | 0 | 49.00 | Company Invited | 1 | 14.00 | Salaried | Male | 3 | 4.00 | Deluxe | 4.00 | Divorced | 2.00 | 0 | 3 | 1 | 2.00 | Manager | 20130.00 |
| 2 | 1 | 37.00 | Self Enquiry | 1 | 8.00 | Free Lancer | Male | 3 | 4.00 | Basic | 3.00 | Single | 7.00 | 1 | 3 | 0 | 0.00 | Executive | 17090.00 |
| 3 | 0 | 33.00 | Company Invited | 1 | 9.00 | Salaried | Female | 2 | 3.00 | Basic | 3.00 | Divorced | 2.00 | 1 | 5 | 1 | 1.00 | Executive | 17909.00 |
| 5 | 0 | 32.00 | Company Invited | 1 | 8.00 | Salaried | Male | 3 | 3.00 | Basic | 3.00 | Single | 1.00 | 0 | 5 | 1 | 1.00 | Executive | 18068.00 |
# Normalize the mis-typed 'Fe Male' category to 'Female'
df.loc[df['Gender'] == 'Fe Male', 'Gender'] = 'Female'
# Verify only two categories remain
df["Gender"].unique()
array(['Female', 'Male'], dtype=object)
# Questions:
# Distribution of monthly income (density-normalized histogram)
sns.histplot(data=df,x='MonthlyIncome',stat='density')
plt.show()
# How often each product was pitched
sns.countplot(data=df,x='ProductPitched')
plt.xticks(rotation=90)
plt.show()
# Mean conversion rate (ProdTaken) by occupation, with confidence intervals
sns.catplot(x='Occupation', y='ProdTaken', data=df, kind="bar", height=8, aspect=2)
plt.xticks(rotation=90)
plt.show()
# Mean conversion rate (ProdTaken) by marital status
sns.catplot(x='MaritalStatus', y='ProdTaken', data=df, kind="bar", height=8, aspect=2)
plt.xticks(rotation=90)
plt.show()
################################ Create Dummy variables
# Gender becomes a binary flag: Female -> 1, Male -> 0
replaceStruct = {
"Gender": {"Female": 1, "Male": 0}
}
# nominal features to expand into one-hot (dummy) columns
oneHotCols=["TypeofContact","Occupation","ProductPitched","MaritalStatus", "Designation"]
# apply the binary mapping, then one-hot encode the nominal columns
df=df.replace(replaceStruct)
df=pd.get_dummies(df, columns=oneHotCols)
# preview the widened dataframe
df.head(10)
| ProdTaken | Age | CityTier | DurationOfPitch | Gender | NumberOfPersonVisiting | NumberOfFollowups | PreferredPropertyStar | NumberOfTrips | Passport | ... | ProductPitched_Super Deluxe | MaritalStatus_Divorced | MaritalStatus_Married | MaritalStatus_Single | MaritalStatus_Unmarried | Designation_AVP | Designation_Executive | Designation_Manager | Designation_Senior Manager | Designation_VP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 41.00 | 3 | 6.00 | 1 | 3 | 3.00 | 3.00 | 1.00 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 1 | 0 | 49.00 | 1 | 14.00 | 0 | 3 | 4.00 | 4.00 | 2.00 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 2 | 1 | 37.00 | 1 | 8.00 | 0 | 3 | 4.00 | 3.00 | 7.00 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3 | 0 | 33.00 | 1 | 9.00 | 1 | 2 | 3.00 | 3.00 | 2.00 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 5 | 0 | 32.00 | 1 | 8.00 | 0 | 3 | 3.00 | 3.00 | 1.00 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 6 | 0 | 59.00 | 1 | 9.00 | 1 | 2 | 2.00 | 5.00 | 5.00 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 7 | 0 | 30.00 | 1 | 30.00 | 0 | 3 | 3.00 | 3.00 | 2.00 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 8 | 0 | 38.00 | 1 | 29.00 | 0 | 2 | 4.00 | 3.00 | 1.00 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 9 | 0 | 36.00 | 1 | 33.00 | 0 | 3 | 3.00 | 3.00 | 7.00 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 10 | 0 | 35.00 | 1 | 22.00 | 0 | 2 | 2.00 | 4.00 | 1.00 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
10 rows × 34 columns
# Print value counts (including NAs) for any remaining non-numeric features.
# After one-hot encoding there should be no object columns left, so this
# doubles as a sanity check.
num_to_display = 10  # defined once so the cap is easy to change later
object_cols = df.dtypes[df.dtypes == 'object'].index
for colname in object_cols:
    val_counts = df[colname].value_counts(dropna=False)
    print(val_counts[:num_to_display])
    if len(val_counts) > num_to_display:
        print(f'Only displaying first {num_to_display} of {len(val_counts)} values.')
    print('\n\n')
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 4128 entries, 0 to 4887 Data columns (total 34 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ProdTaken 4128 non-null int64 1 Age 4128 non-null float64 2 CityTier 4128 non-null int64 3 DurationOfPitch 4128 non-null float64 4 Gender 4128 non-null int64 5 NumberOfPersonVisiting 4128 non-null int64 6 NumberOfFollowups 4128 non-null float64 7 PreferredPropertyStar 4128 non-null float64 8 NumberOfTrips 4128 non-null float64 9 Passport 4128 non-null int64 10 PitchSatisfactionScore 4128 non-null int64 11 OwnCar 4128 non-null int64 12 NumberOfChildrenVisiting 4128 non-null float64 13 MonthlyIncome 4128 non-null float64 14 TypeofContact_Company Invited 4128 non-null uint8 15 TypeofContact_Self Enquiry 4128 non-null uint8 16 Occupation_Free Lancer 4128 non-null uint8 17 Occupation_Large Business 4128 non-null uint8 18 Occupation_Salaried 4128 non-null uint8 19 Occupation_Small Business 4128 non-null uint8 20 ProductPitched_Basic 4128 non-null uint8 21 ProductPitched_Deluxe 4128 non-null uint8 22 ProductPitched_King 4128 non-null uint8 23 ProductPitched_Standard 4128 non-null uint8 24 ProductPitched_Super Deluxe 4128 non-null uint8 25 MaritalStatus_Divorced 4128 non-null uint8 26 MaritalStatus_Married 4128 non-null uint8 27 MaritalStatus_Single 4128 non-null uint8 28 MaritalStatus_Unmarried 4128 non-null uint8 29 Designation_AVP 4128 non-null uint8 30 Designation_Executive 4128 non-null uint8 31 Designation_Manager 4128 non-null uint8 32 Designation_Senior Manager 4128 non-null uint8 33 Designation_VP 4128 non-null uint8 dtypes: float64(7), int64(7), uint8(20) memory usage: 564.4 KB
# Heatmap of pairwise correlations across all encoded features
correlations = df.corr()
plt.figure(figsize=(15, 7))
sns.heatmap(correlations, annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()
# pandas is already imported at the top of the file, so the duplicate
# mid-script import was removed.
pd.set_option('display.max_rows', None)
# Create the Pearson correlation matrix
corr_mat = df.corr(method='pearson')
# Mask the lower triangle and diagonal so each variable pair appears exactly once
corr_mat = corr_mat.mask(np.tril(np.ones_like(corr_mat, dtype=bool)))
# Flatten to a Series, drop the masked (NaN) cells, and sort by correlation.
# BUG FIX: the previous drop_duplicates() dropped *different* variable pairs
# that happened to share the same (rounded) correlation value; dropna() keeps
# every remaining pair while still removing the masked cells.
sorted_mat = corr_mat.unstack().dropna().sort_values()
# Remove perfect correlations (value of exactly 1)
sorted_mat = sorted_mat[sorted_mat != 1]
print(sorted_mat)
TypeofContact_Self Enquiry TypeofContact_Company Invited -1.00
Occupation_Small Business Occupation_Salaried -0.83
ProductPitched_Basic MonthlyIncome -0.62
Designation_Executive ProductPitched_Deluxe -0.58
ProductPitched_Deluxe ProductPitched_Basic -0.58
MaritalStatus_Married MaritalStatus_Divorced -0.47
MaritalStatus_Unmarried MaritalStatus_Married -0.43
MaritalStatus_Single MaritalStatus_Married -0.42
ProductPitched_Standard ProductPitched_Basic -0.37
Designation_Executive ProductPitched_Standard -0.37
ProductPitched_Basic Age -0.37
ProductPitched_Standard ProductPitched_Deluxe -0.34
Occupation_Salaried Occupation_Large Business -0.31
Occupation_Small Business Occupation_Large Business -0.27
ProductPitched_Basic CityTier -0.26
MaritalStatus_Unmarried MaritalStatus_Divorced -0.22
MaritalStatus_Single MaritalStatus_Divorced -0.21
ProductPitched_Super Deluxe ProductPitched_Basic -0.20
Designation_Executive ProductPitched_Super Deluxe -0.20
MaritalStatus_Unmarried MaritalStatus_Single -0.20
ProductPitched_Super Deluxe ProductPitched_Deluxe -0.18
Designation_Manager ProductPitched_Super Deluxe -0.18
MaritalStatus_Single MonthlyIncome -0.17
Designation_Manager MaritalStatus_Single -0.16
MaritalStatus_Single ProductPitched_Deluxe -0.16
Age ProdTaken -0.15
Occupation_Salaried CityTier -0.15
MaritalStatus_Single NumberOfPersonVisiting -0.15
ProductPitched_Deluxe ProdTaken -0.14
MonthlyIncome ProdTaken -0.14
ProductPitched_King ProductPitched_Basic -0.13
NumberOfPersonVisiting -0.13
MaritalStatus_Married ProdTaken -0.12
MaritalStatus_Single ProductPitched_Standard -0.12
Designation_Senior Manager ProductPitched_Super Deluxe -0.12
ProductPitched_Super Deluxe ProductPitched_Standard -0.12
ProductPitched_King ProductPitched_Deluxe -0.12
MaritalStatus_Unmarried ProductPitched_Basic -0.11
MaritalStatus_Single Age -0.11
ProductPitched_Super Deluxe NumberOfPersonVisiting -0.10
MaritalStatus_Single NumberOfChildrenVisiting -0.10
MaritalStatus_Unmarried ProductPitched_Super Deluxe -0.10
ProductPitched_Super Deluxe NumberOfFollowups -0.08
MaritalStatus_Single NumberOfTrips -0.08
ProductPitched_Super Deluxe NumberOfChildrenVisiting -0.08
ProdTaken -0.08
ProductPitched_Standard ProductPitched_King -0.07
MaritalStatus_Divorced ProdTaken -0.07
MaritalStatus_Unmarried ProductPitched_King -0.07
ProductPitched_King DurationOfPitch -0.07
NumberOfChildrenVisiting -0.07
ProductPitched_Basic Occupation_Small Business -0.07
NumberOfTrips -0.07
ProductPitched_King NumberOfFollowups -0.06
TypeofContact_Self Enquiry ProdTaken -0.06
Designation_Executive MaritalStatus_Married -0.05
MaritalStatus_Married ProductPitched_Basic -0.05
ProductPitched_Basic OwnCar -0.05
Occupation_Salaried DurationOfPitch -0.05
Occupation_Small Business PitchSatisfactionScore -0.05
ProductPitched_Basic DurationOfPitch -0.04
ProductPitched_Deluxe Occupation_Large Business -0.04
MaritalStatus_Single DurationOfPitch -0.04
ProductPitched_Basic Gender -0.04
MaritalStatus_Single CityTier -0.04
ProductPitched_King ProdTaken -0.04
MaritalStatus_Married Gender -0.04
ProductPitched_King CityTier -0.04
ProductPitched_Super Deluxe ProductPitched_King -0.04
MaritalStatus_Single NumberOfFollowups -0.04
MaritalStatus_Married NumberOfFollowups -0.04
ProductPitched_Deluxe PitchSatisfactionScore -0.04
Occupation_Small Business OwnCar -0.04
Occupation_Large Business TypeofContact_Company Invited -0.04
ProductPitched_Standard ProdTaken -0.04
Occupation_Salaried TypeofContact_Self Enquiry -0.03
MaritalStatus_Unmarried Age -0.03
Occupation_Salaried ProdTaken -0.03
MaritalStatus_Divorced TypeofContact_Company Invited -0.03
ProductPitched_King TypeofContact_Company Invited -0.03
TypeofContact_Company Invited PreferredPropertyStar -0.03
PreferredPropertyStar Age -0.03
NumberOfChildrenVisiting Age -0.03
MaritalStatus_Married CityTier -0.03
PitchSatisfactionScore -0.03
PitchSatisfactionScore CityTier -0.03
Gender ProdTaken -0.03
Occupation_Free Lancer OwnCar -0.03
ProductPitched_King PreferredPropertyStar -0.03
OwnCar Passport -0.03
Occupation_Large Business DurationOfPitch -0.03
Occupation_Salaried PreferredPropertyStar -0.03
ProductPitched_Standard Occupation_Salaried -0.03
ProductPitched_King NumberOfTrips -0.03
ProductPitched_Standard NumberOfChildrenVisiting -0.03
NumberOfFollowups Age -0.03
MaritalStatus_Divorced NumberOfFollowups -0.03
MaritalStatus_Unmarried Passport -0.02
ProductPitched_Super Deluxe Occupation_Large Business -0.02
MaritalStatus_Single OwnCar -0.02
NumberOfPersonVisiting Age -0.02
ProductPitched_Deluxe MonthlyIncome -0.02
TypeofContact_Self Enquiry NumberOfFollowups -0.02
PitchSatisfactionScore -0.02
Occupation_Salaried MonthlyIncome -0.02
ProductPitched_Deluxe Occupation_Salaried -0.02
Occupation_Large Business Age -0.02
ProductPitched_Deluxe TypeofContact_Self Enquiry -0.02
MaritalStatus_Divorced ProductPitched_Basic -0.02
OwnCar NumberOfTrips -0.02
Occupation_Free Lancer MonthlyIncome -0.02
Occupation_Salaried Occupation_Free Lancer -0.02
MaritalStatus_Married Occupation_Free Lancer -0.02
ProductPitched_Deluxe PreferredPropertyStar -0.02
ProductPitched_Standard NumberOfPersonVisiting -0.02
Occupation_Large Business MonthlyIncome -0.02
PitchSatisfactionScore PreferredPropertyStar -0.02
ProductPitched_King Occupation_Large Business -0.02
Occupation_Free Lancer DurationOfPitch -0.02
Occupation_Small Business Occupation_Free Lancer -0.02
MaritalStatus_Unmarried TypeofContact_Self Enquiry -0.02
ProductPitched_King Gender -0.02
Occupation_Free Lancer NumberOfChildrenVisiting -0.02
Gender -0.02
PreferredPropertyStar NumberOfFollowups -0.02
MaritalStatus_Married Occupation_Large Business -0.02
PitchSatisfactionScore NumberOfPersonVisiting -0.02
NumberOfChildrenVisiting Gender -0.02
TypeofContact_Self Enquiry DurationOfPitch -0.02
PreferredPropertyStar CityTier -0.02
Occupation_Free Lancer PreferredPropertyStar -0.02
ProductPitched_King Passport -0.02
ProductPitched_Deluxe Occupation_Free Lancer -0.02
Occupation_Free Lancer CityTier -0.02
ProductPitched_Basic TypeofContact_Company Invited -0.02
Occupation_Small Business ProdTaken -0.02
ProductPitched_Deluxe Passport -0.01
Occupation_Free Lancer TypeofContact_Company Invited -0.01
MaritalStatus_Unmarried Occupation_Small Business -0.01
ProductPitched_Super Deluxe PreferredPropertyStar -0.01
ProductPitched_Standard Passport -0.01
ProductPitched_Deluxe OwnCar -0.01
MaritalStatus_Single Occupation_Salaried -0.01
Occupation_Salaried NumberOfTrips -0.01
Occupation_Small Business TypeofContact_Company Invited -0.01
Occupation_Large Business CityTier -0.01
TypeofContact_Company Invited NumberOfTrips -0.01
Age -0.01
MaritalStatus_Unmarried PitchSatisfactionScore -0.01
MaritalStatus_Divorced Occupation_Free Lancer -0.01
MaritalStatus_Single PreferredPropertyStar -0.01
ProductPitched_Standard Occupation_Free Lancer -0.01
NumberOfTrips CityTier -0.01
Occupation_Salaried Passport -0.01
MaritalStatus_Unmarried Occupation_Free Lancer -0.01
MaritalStatus_Divorced NumberOfChildrenVisiting -0.01
Occupation_Salaried NumberOfChildrenVisiting -0.01
Occupation_Small Business NumberOfFollowups -0.01
OwnCar ProdTaken -0.01
TypeofContact_Self Enquiry NumberOfChildrenVisiting -0.01
CityTier -0.01
MaritalStatus_Unmarried DurationOfPitch -0.01
MaritalStatus_Married OwnCar -0.01
MaritalStatus_Single TypeofContact_Self Enquiry -0.01
PitchSatisfactionScore NumberOfTrips -0.01
TypeofContact_Self Enquiry NumberOfPersonVisiting -0.01
DurationOfPitch Age -0.01
Occupation_Salaried NumberOfFollowups -0.01
Age -0.01
ProductPitched_King Occupation_Salaried -0.01
Occupation_Salaried Gender -0.01
Occupation_Large Business Occupation_Free Lancer -0.01
MaritalStatus_Unmarried PreferredPropertyStar -0.01
MaritalStatus_Single Occupation_Small Business -0.01
Gender DurationOfPitch -0.01
MaritalStatus_Divorced Passport -0.01
Occupation_Small Business NumberOfTrips -0.01
TypeofContact_Self Enquiry Gender -0.01
MaritalStatus_Married TypeofContact_Self Enquiry -0.01
ProductPitched_Super Deluxe Occupation_Free Lancer -0.01
PreferredPropertyStar DurationOfPitch -0.01
MaritalStatus_Married Occupation_Salaried -0.00
TypeofContact_Company Invited MonthlyIncome -0.00
MaritalStatus_Divorced Occupation_Large Business -0.00
ProductPitched_Super Deluxe TypeofContact_Self Enquiry -0.00
ProductPitched_King Occupation_Free Lancer -0.00
MaritalStatus_Divorced PreferredPropertyStar -0.00
Occupation_Large Business NumberOfPersonVisiting -0.00
TypeofContact_Company Invited OwnCar -0.00
ProductPitched_Standard TypeofContact_Self Enquiry -0.00
PitchSatisfactionScore -0.00
Occupation_Salaried NumberOfPersonVisiting -0.00
Occupation_Free Lancer PitchSatisfactionScore -0.00
Occupation_Large Business Passport -0.00
MaritalStatus_Divorced Occupation_Small Business -0.00
TypeofContact_Self Enquiry Passport -0.00
MaritalStatus_Divorced Gender -0.00
MaritalStatus_Unmarried Occupation_Large Business 0.00
OwnCar DurationOfPitch 0.00
NumberOfChildrenVisiting PitchSatisfactionScore 0.00
Occupation_Free Lancer Age 0.00
TypeofContact_Company Invited Passport 0.00
NumberOfTrips Gender 0.00
Passport NumberOfFollowups 0.00
PreferredPropertyStar 0.00
NumberOfPersonVisiting CityTier 0.00
PitchSatisfactionScore Gender 0.00
MaritalStatus_Married Passport 0.00
Designation_Manager MaritalStatus_Divorced 0.00
MaritalStatus_Divorced ProductPitched_Deluxe 0.00
ProductPitched_Standard 0.00
Designation_Senior Manager MaritalStatus_Divorced 0.00
MonthlyIncome Passport 0.00
PreferredPropertyStar 0.00
ProductPitched_Basic NumberOfFollowups 0.00
NumberOfChildrenVisiting CityTier 0.00
Occupation_Small Business NumberOfChildrenVisiting 0.00
ProductPitched_Standard TypeofContact_Company Invited 0.00
ProductPitched_Basic PreferredPropertyStar 0.00
TypeofContact_Self Enquiry OwnCar 0.00
Occupation_Small Business NumberOfPersonVisiting 0.00
MaritalStatus_Divorced Occupation_Salaried 0.00
NumberOfPersonVisiting ProdTaken 0.00
ProductPitched_Super Deluxe TypeofContact_Company Invited 0.00
Occupation_Salaried 0.00
Occupation_Large Business Gender 0.00
TypeofContact_Self Enquiry MonthlyIncome 0.00
NumberOfFollowups Gender 0.00
ProductPitched_Basic PitchSatisfactionScore 0.00
Passport CityTier 0.00
CityTier Age 0.01
PitchSatisfactionScore NumberOfFollowups 0.01
Occupation_Small Business Gender 0.01
MaritalStatus_Married TypeofContact_Company Invited 0.01
Passport NumberOfPersonVisiting 0.01
TypeofContact_Company Invited Gender 0.01
NumberOfTrips PreferredPropertyStar 0.01
ProductPitched_Deluxe Gender 0.01
ProductPitched_Standard Occupation_Small Business 0.01
Designation_AVP MaritalStatus_Married 0.01
MaritalStatus_Married ProductPitched_Super Deluxe 0.01
TypeofContact_Company Invited NumberOfPersonVisiting 0.01
MaritalStatus_Divorced CityTier 0.01
MaritalStatus_Single TypeofContact_Company Invited 0.01
Passport NumberOfTrips 0.01
NumberOfTrips ProdTaken 0.01
DurationOfPitch CityTier 0.01
TypeofContact_Company Invited CityTier 0.01
NumberOfChildrenVisiting 0.01
NumberOfChildrenVisiting ProdTaken 0.01
Occupation_Small Business Passport 0.01
MaritalStatus_Unmarried NumberOfTrips 0.01
ProductPitched_Standard NumberOfTrips 0.01
OwnCar NumberOfPersonVisiting 0.01
PitchSatisfactionScore Passport 0.01
OwnCar NumberOfFollowups 0.01
ProductPitched_Super Deluxe Occupation_Small Business 0.01
TypeofContact_Self Enquiry Age 0.01
MaritalStatus_Divorced NumberOfTrips 0.01
Occupation_Large Business PreferredPropertyStar 0.01
MaritalStatus_Unmarried OwnCar 0.01
TypeofContact_Self Enquiry NumberOfTrips 0.01
Occupation_Small Business TypeofContact_Self Enquiry 0.01
NumberOfTrips DurationOfPitch 0.01
PitchSatisfactionScore DurationOfPitch 0.01
OwnCar CityTier 0.01
MaritalStatus_Married ProductPitched_King 0.01
Designation_VP MaritalStatus_Married 0.01
MaritalStatus_Divorced ProductPitched_King 0.01
Designation_VP MaritalStatus_Divorced 0.01
PitchSatisfactionScore Age 0.01
Occupation_Large Business NumberOfChildrenVisiting 0.01
MaritalStatus_Divorced DurationOfPitch 0.01
MaritalStatus_Unmarried Occupation_Salaried 0.01
Occupation_Free Lancer TypeofContact_Self Enquiry 0.01
NumberOfPersonVisiting Gender 0.01
OwnCar Gender 0.01
ProductPitched_Basic TypeofContact_Self Enquiry 0.02
MaritalStatus_Married PreferredPropertyStar 0.02
ProductPitched_Super Deluxe Passport 0.02
MaritalStatus_Married Occupation_Small Business 0.02
OwnCar PreferredPropertyStar 0.02
NumberOfChildrenVisiting Passport 0.02
NumberOfFollowups DurationOfPitch 0.02
Occupation_Free Lancer NumberOfFollowups 0.02
MaritalStatus_Single PitchSatisfactionScore 0.02
Occupation_Free Lancer NumberOfPersonVisiting 0.02
MaritalStatus_Married NumberOfChildrenVisiting 0.02
TypeofContact_Company Invited DurationOfPitch 0.02
Gender CityTier 0.02
MaritalStatus_Unmarried TypeofContact_Company Invited 0.02
ProductPitched_King Occupation_Small Business 0.02
PitchSatisfactionScore 0.02
Occupation_Small Business Age 0.02
PreferredPropertyStar 0.02
MaritalStatus_Married NumberOfPersonVisiting 0.02
MaritalStatus_Single Gender 0.02
ProductPitched_Basic Passport 0.02
ProductPitched_Standard DurationOfPitch 0.02
Occupation_Salaried OwnCar 0.02
PitchSatisfactionScore 0.02
ProductPitched_Deluxe TypeofContact_Company Invited 0.02
ProductPitched_Basic NumberOfChildrenVisiting 0.02
MaritalStatus_Divorced OwnCar 0.02
TypeofContact_Company Invited PitchSatisfactionScore 0.02
NumberOfFollowups 0.02
ProductPitched_Standard NumberOfFollowups 0.02
ProductPitched_Super Deluxe DurationOfPitch 0.02
MonthlyIncome PitchSatisfactionScore 0.02
Designation_Manager MaritalStatus_Married 0.02
MaritalStatus_Married ProductPitched_Deluxe 0.02
MaritalStatus_Divorced NumberOfPersonVisiting 0.03
NumberOfFollowups CityTier 0.03
ProductPitched_Super Deluxe CityTier 0.03
Passport Age 0.03
ProductPitched_Basic Occupation_Free Lancer 0.03
MaritalStatus_Married ProductPitched_Standard 0.03
Designation_Senior Manager MaritalStatus_Married 0.03
MaritalStatus_Married DurationOfPitch 0.03
Occupation_Large Business NumberOfTrips 0.03
NumberOfFollowups 0.03
NumberOfChildrenVisiting OwnCar 0.03
MaritalStatus_Single Occupation_Large Business 0.03
Passport Gender 0.03
Occupation_Large Business OwnCar 0.03
ProductPitched_King OwnCar 0.03
PreferredPropertyStar Gender 0.03
Gender Age 0.03
MaritalStatus_Single Passport 0.03
ProductPitched_Super Deluxe NumberOfTrips 0.03
MaritalStatus_Divorced ProductPitched_Super Deluxe 0.03
TypeofContact_Self Enquiry PreferredPropertyStar 0.03
ProductPitched_King TypeofContact_Self Enquiry 0.03
MaritalStatus_Divorced PitchSatisfactionScore 0.03
TypeofContact_Self Enquiry 0.03
ProductPitched_Standard Gender 0.03
ProductPitched_Deluxe Age 0.03
MaritalStatus_Divorced MonthlyIncome 0.03
Occupation_Free Lancer Passport 0.03
Occupation_Salaried TypeofContact_Company Invited 0.03
ProductPitched_Basic Occupation_Large Business 0.03
ProductPitched_Standard Occupation_Large Business 0.03
MonthlyIncome DurationOfPitch 0.03
ProductPitched_Super Deluxe Gender 0.04
Occupation_Small Business MonthlyIncome 0.04
MaritalStatus_Unmarried Gender 0.04
Occupation_Large Business TypeofContact_Self Enquiry 0.04
OwnCar Age 0.04
MaritalStatus_Married MonthlyIncome 0.04
NumberOfChildrenVisiting DurationOfPitch 0.04
ProductPitched_Standard OwnCar 0.04
Passport DurationOfPitch 0.04
ProductPitched_Deluxe DurationOfPitch 0.04
NumberOfFollowups 0.04
ProductPitched_Basic NumberOfPersonVisiting 0.04
MonthlyIncome Gender 0.04
ProductPitched_Standard PreferredPropertyStar 0.04
PreferredPropertyStar NumberOfPersonVisiting 0.04
NumberOfChildrenVisiting PreferredPropertyStar 0.04
Occupation_Large Business PitchSatisfactionScore 0.04
MaritalStatus_Single ProductPitched_King 0.04
Designation_VP MaritalStatus_Single 0.04
ProductPitched_Basic Occupation_Salaried 0.04
MaritalStatus_Married NumberOfTrips 0.04
Occupation_Free Lancer ProdTaken 0.05
MaritalStatus_Divorced Age 0.05
ProductPitched_Super Deluxe OwnCar 0.05
Occupation_Free Lancer NumberOfTrips 0.05
ProductPitched_Deluxe Occupation_Small Business 0.05
MaritalStatus_Single Occupation_Free Lancer 0.05
PitchSatisfactionScore ProdTaken 0.05
ProductPitched_Deluxe NumberOfTrips 0.05
MaritalStatus_Single ProductPitched_Super Deluxe 0.05
Designation_AVP MaritalStatus_Single 0.05
TypeofContact_Company Invited ProdTaken 0.06
MaritalStatus_Unmarried ProdTaken 0.06
ProductPitched_Super Deluxe PitchSatisfactionScore 0.06
ProductPitched_Deluxe NumberOfChildrenVisiting 0.06
ProductPitched_Standard CityTier 0.07
Occupation_Small Business DurationOfPitch 0.07
ProductPitched_Deluxe NumberOfPersonVisiting 0.07
MaritalStatus_Married Age 0.07
MonthlyIncome OwnCar 0.07
OwnCar PitchSatisfactionScore 0.07
MaritalStatus_Unmarried CityTier 0.08
MonthlyIncome 0.08
NumberOfPersonVisiting DurationOfPitch 0.08
Occupation_Large Business ProdTaken 0.08
MaritalStatus_Unmarried ProductPitched_Standard 0.08
DurationOfPitch ProdTaken 0.09
MaritalStatus_Unmarried NumberOfChildrenVisiting 0.09
NumberOfPersonVisiting 0.09
CityTier ProdTaken 0.09
PreferredPropertyStar ProdTaken 0.10
MonthlyIncome CityTier 0.10
NumberOfFollowups ProdTaken 0.11
MaritalStatus_Unmarried NumberOfFollowups 0.12
ProductPitched_Deluxe 0.12
Designation_Manager MaritalStatus_Unmarried 0.12
NumberOfTrips NumberOfFollowups 0.13
MonthlyIncome NumberOfTrips 0.14
NumberOfFollowups 0.15
NumberOfPersonVisiting 0.16
Occupation_Small Business CityTier 0.16
MonthlyIncome NumberOfChildrenVisiting 0.16
ProductPitched_Standard Age 0.17
ProductPitched_King Age 0.17
NumberOfChildrenVisiting NumberOfTrips 0.17
NumberOfTrips Age 0.19
NumberOfPersonVisiting 0.19
MaritalStatus_Single ProdTaken 0.19
Designation_Executive MaritalStatus_Single 0.21
MaritalStatus_Single ProductPitched_Basic 0.21
ProductPitched_Deluxe CityTier 0.22
ProductPitched_Basic ProdTaken 0.22
Passport ProdTaken 0.27
NumberOfChildrenVisiting NumberOfFollowups 0.28
ProductPitched_Super Deluxe Age 0.31
NumberOfFollowups NumberOfPersonVisiting 0.32
ProductPitched_King MonthlyIncome 0.34
ProductPitched_Standard MonthlyIncome 0.40
MonthlyIncome Age 0.43
ProductPitched_Super Deluxe MonthlyIncome 0.47
NumberOfChildrenVisiting NumberOfPersonVisiting 0.60
dtype: float64
# Libraries to build decision tree classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
# To tune different models
from sklearn.model_selection import GridSearchCV
# To get different metric scores
from sklearn.metrics import (
f1_score,
accuracy_score,
recall_score,
precision_score,
confusion_matrix,
# plot_confusion_matrix,
make_scorer,
)
# Separate the target column from the predictors.
y = df["ProdTaken"]
X = df.drop(["ProdTaken"], axis=1)
# Hold out 30% of the rows for testing; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)
print("Number of rows in train data =", X_train.shape[0])
print("Number of rows in test data =", X_test.shape[0])
Number of rows in train data = 2889 Number of rows in test data = 1239
# Sanity-check that the ~80/20 class imbalance is preserved in both splits
print("Percentage of classes in training set:")
print(y_train.value_counts(normalize=True))
print("Percentage of classes in test set:")
print(y_test.value_counts(normalize=True))
Percentage of classes in training set: 0 0.81 1 0.19 Name: ProdTaken, dtype: float64 Percentage of classes in test set: 0 0.80 1 0.20 Name: ProdTaken, dtype: float64
# Helper used throughout the notebook to score every fitted classifier.
def model_performance_classification_sklearn(model, predictors, target):
    """
    Compute common classification metrics for a fitted sklearn-style model.

    model: fitted classifier exposing .predict
    predictors: independent variables
    target: true labels

    Returns a one-row DataFrame with Accuracy, Recall, Precision and F1.
    """
    # Predict on the given features, then score against the true labels.
    predictions = model.predict(predictors)
    metric_values = {
        "Accuracy": accuracy_score(target, predictions),
        "Recall": recall_score(target, predictions),
        "Precision": precision_score(target, predictions),
        "F1": f1_score(target, predictions),
    }
    # Single-row frame so results from several models can be concatenated.
    return pd.DataFrame(metric_values, index=[0])
def confusion_matrix_sklearn(model, predictors, target):
    """
    Plot the confusion matrix of a fitted classifier, annotating each cell
    with the raw count and its percentage of all samples.

    model: fitted classifier exposing .predict
    predictors: independent variables
    target: true labels
    """
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)
    total = cm.sum()
    # Build a "count\npercent" annotation per cell. Reshaping to cm.shape
    # (instead of the original hard-coded (2, 2)) also works for targets
    # with more than two classes.
    labels = np.asarray(
        [
            "{0:0.0f}".format(item) + "\n{0:.2%}".format(item / total)
            for item in cm.flatten()
        ]
    ).reshape(cm.shape)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
#### Building the baseline decision tree classifier (Gini impurity, fixed seed)
model = DecisionTreeClassifier(criterion="gini", random_state=1)
model.fit(X_train, y_train)
DecisionTreeClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(random_state=1)
###### Creating a decision tree model
# Training-set performance of the un-pruned tree (a fully grown tree
# memorises the training data, so perfect scores here indicate overfitting).
decision_tree_perf_train = model_performance_classification_sklearn(
    model, X_train, y_train
)
decision_tree_perf_train
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.00 | 1.00 | 1.00 | 1.00 |
confusion_matrix_sklearn(model, X_train, y_train)
decision_tree_perf_test = model_performance_classification_sklearn(
model, X_test, y_test
)
decision_tree_perf_test
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.90 | 0.76 | 0.76 | 0.76 |
confusion_matrix_sklearn(model, X_test, y_test)
# Keep the model's input column names for tree plots and importance labels
column_names = list(X.columns)
feature_names = column_names
print(feature_names)
['Age', 'CityTier', 'DurationOfPitch', 'Gender', 'NumberOfPersonVisiting', 'NumberOfFollowups', 'PreferredPropertyStar', 'NumberOfTrips', 'Passport', 'PitchSatisfactionScore', 'OwnCar', 'NumberOfChildrenVisiting', 'MonthlyIncome', 'TypeofContact_Company Invited', 'TypeofContact_Self Enquiry', 'Occupation_Free Lancer', 'Occupation_Large Business', 'Occupation_Salaried', 'Occupation_Small Business', 'ProductPitched_Basic', 'ProductPitched_Deluxe', 'ProductPitched_King', 'ProductPitched_Standard', 'ProductPitched_Super Deluxe', 'MaritalStatus_Divorced', 'MaritalStatus_Married', 'MaritalStatus_Single', 'MaritalStatus_Unmarried', 'Designation_AVP', 'Designation_Executive', 'Designation_Manager', 'Designation_Senior Manager', 'Designation_VP']
# Visualise the fitted decision tree.
plt.figure(figsize=(20, 30))
out = tree.plot_tree(
    model,
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=True,
    class_names=True,
)
# Darken and thicken the branch arrows for readability.
for artist in out:
    patch = artist.arrow_patch
    if patch is None:
        continue
    patch.set_edgecolor("black")
    patch.set_linewidth(1)
plt.show()
# Rank features by the impurity reduction they contribute to the tree.
importances = model.feature_importances_
indices = np.argsort(importances)
positions = range(len(indices))
sorted_labels = [feature_names[i] for i in indices]
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(positions, importances[indices], color="violet", align="center")
plt.yticks(positions, sorted_labels)
plt.xlabel("Relative Importance")
plt.show()
#### Hyperparameter tuning on Decision Tree
# Choose the type of classifier.
estimator = DecisionTreeClassifier(random_state=1)
# Grid of parameters to choose from.
# FIX: the original grid was [np.arange(2, 50, 5), None] -- the whole array
# was a single (invalid) max_depth candidate, so only None was usable.
# The array must be unpacked into individual depth values.
parameters = {
    "max_depth": list(np.arange(2, 50, 5)) + [None],
    "criterion": ["entropy", "gini"],
    "splitter": ["best", "random"],
    "min_impurity_decrease": [0.000001, 0.00001, 0.0001],
}
# Scorer used to compare parameter combinations (recall, not accuracy --
# the original name `acc_scorer` was misleading).
recall_scorer = make_scorer(recall_score)
# Exhaustive search over all combinations with 5-fold cross-validation.
grid_obj = GridSearchCV(estimator, parameters, scoring=recall_scorer, cv=5)
grid_obj = grid_obj.fit(X_train, y_train)
# Keep the best combination of parameters.
estimator = grid_obj.best_estimator_
# Fit the best algorithm to the data.
estimator.fit(X_train, y_train)
DecisionTreeClassifier(min_impurity_decrease=1e-06, random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(min_impurity_decrease=1e-06, random_state=1)
# Training-set performance of the tuned (pre-pruned) tree
decision_tree_tune_perf_train = model_performance_classification_sklearn(
    estimator, X_train, y_train
)
decision_tree_tune_perf_train
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.00 | 1.00 | 1.00 | 1.00 |
confusion_matrix_sklearn(estimator, X_train, y_train)
decision_tree_tune_perf_test = model_performance_classification_sklearn(
estimator, X_test, y_test
)
decision_tree_tune_perf_test
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.90 | 0.76 | 0.76 | 0.76 |
confusion_matrix_sklearn(estimator, X_test, y_test)
# Visualise the tuned (pre-pruned) tree
plt.figure(figsize=(15, 12))
tree.plot_tree(
    estimator,
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=True,
    class_names=True,
)
plt.show()
# Cost-complexity (post-)pruning: compute the effective alphas and the
# corresponding total leaf impurities along the pruning path.
clf = DecisionTreeClassifier(random_state=1)
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
pd.DataFrame(path)
| ccp_alphas | impurities | |
|---|---|---|
| 0 | 0.00 | 0.00 |
| 1 | 0.00 | 0.00 |
| 2 | 0.00 | 0.00 |
| 3 | 0.00 | 0.00 |
| 4 | 0.00 | 0.00 |
| 5 | 0.00 | 0.00 |
| 6 | 0.00 | 0.00 |
| 7 | 0.00 | 0.00 |
| 8 | 0.00 | 0.01 |
| 9 | 0.00 | 0.01 |
| 10 | 0.00 | 0.01 |
| 11 | 0.00 | 0.01 |
| 12 | 0.00 | 0.01 |
| 13 | 0.00 | 0.01 |
| 14 | 0.00 | 0.01 |
| 15 | 0.00 | 0.01 |
| 16 | 0.00 | 0.01 |
| 17 | 0.00 | 0.01 |
| 18 | 0.00 | 0.01 |
| 19 | 0.00 | 0.01 |
| 20 | 0.00 | 0.01 |
| 21 | 0.00 | 0.02 |
| 22 | 0.00 | 0.02 |
| 23 | 0.00 | 0.02 |
| 24 | 0.00 | 0.02 |
| 25 | 0.00 | 0.02 |
| 26 | 0.00 | 0.02 |
| 27 | 0.00 | 0.02 |
| 28 | 0.00 | 0.02 |
| 29 | 0.00 | 0.02 |
| 30 | 0.00 | 0.02 |
| 31 | 0.00 | 0.02 |
| 32 | 0.00 | 0.02 |
| 33 | 0.00 | 0.03 |
| 34 | 0.00 | 0.03 |
| 35 | 0.00 | 0.03 |
| 36 | 0.00 | 0.03 |
| 37 | 0.00 | 0.03 |
| 38 | 0.00 | 0.03 |
| 39 | 0.00 | 0.03 |
| 40 | 0.00 | 0.03 |
| 41 | 0.00 | 0.03 |
| 42 | 0.00 | 0.03 |
| 43 | 0.00 | 0.03 |
| 44 | 0.00 | 0.04 |
| 45 | 0.00 | 0.04 |
| 46 | 0.00 | 0.04 |
| 47 | 0.00 | 0.04 |
| 48 | 0.00 | 0.04 |
| 49 | 0.00 | 0.04 |
| 50 | 0.00 | 0.04 |
| 51 | 0.00 | 0.04 |
| 52 | 0.00 | 0.04 |
| 53 | 0.00 | 0.04 |
| 54 | 0.00 | 0.04 |
| 55 | 0.00 | 0.04 |
| 56 | 0.00 | 0.04 |
| 57 | 0.00 | 0.04 |
| 58 | 0.00 | 0.04 |
| 59 | 0.00 | 0.05 |
| 60 | 0.00 | 0.05 |
| 61 | 0.00 | 0.05 |
| 62 | 0.00 | 0.05 |
| 63 | 0.00 | 0.05 |
| 64 | 0.00 | 0.05 |
| 65 | 0.00 | 0.05 |
| 66 | 0.00 | 0.05 |
| 67 | 0.00 | 0.05 |
| 68 | 0.00 | 0.05 |
| 69 | 0.00 | 0.05 |
| 70 | 0.00 | 0.05 |
| 71 | 0.00 | 0.05 |
| 72 | 0.00 | 0.05 |
| 73 | 0.00 | 0.05 |
| 74 | 0.00 | 0.06 |
| 75 | 0.00 | 0.06 |
| 76 | 0.00 | 0.06 |
| 77 | 0.00 | 0.06 |
| 78 | 0.00 | 0.06 |
| 79 | 0.00 | 0.06 |
| 80 | 0.00 | 0.06 |
| 81 | 0.00 | 0.06 |
| 82 | 0.00 | 0.06 |
| 83 | 0.00 | 0.06 |
| 84 | 0.00 | 0.06 |
| 85 | 0.00 | 0.07 |
| 86 | 0.00 | 0.07 |
| 87 | 0.00 | 0.07 |
| 88 | 0.00 | 0.07 |
| 89 | 0.00 | 0.07 |
| 90 | 0.00 | 0.07 |
| 91 | 0.00 | 0.08 |
| 92 | 0.00 | 0.08 |
| 93 | 0.00 | 0.08 |
| 94 | 0.00 | 0.08 |
| 95 | 0.00 | 0.08 |
| 96 | 0.00 | 0.09 |
| 97 | 0.00 | 0.09 |
| 98 | 0.00 | 0.09 |
| 99 | 0.00 | 0.09 |
| 100 | 0.00 | 0.09 |
| 101 | 0.00 | 0.09 |
| 102 | 0.00 | 0.10 |
| 103 | 0.00 | 0.10 |
| 104 | 0.00 | 0.10 |
| 105 | 0.00 | 0.10 |
| 106 | 0.00 | 0.10 |
| 107 | 0.00 | 0.10 |
| 108 | 0.00 | 0.10 |
| 109 | 0.00 | 0.10 |
| 110 | 0.00 | 0.11 |
| 111 | 0.00 | 0.11 |
| 112 | 0.00 | 0.11 |
| 113 | 0.00 | 0.12 |
| 114 | 0.00 | 0.12 |
| 115 | 0.00 | 0.12 |
| 116 | 0.00 | 0.12 |
| 117 | 0.00 | 0.12 |
| 118 | 0.00 | 0.12 |
| 119 | 0.00 | 0.12 |
| 120 | 0.00 | 0.13 |
| 121 | 0.00 | 0.13 |
| 122 | 0.00 | 0.14 |
| 123 | 0.00 | 0.14 |
| 124 | 0.00 | 0.14 |
| 125 | 0.00 | 0.14 |
| 126 | 0.00 | 0.14 |
| 127 | 0.00 | 0.14 |
| 128 | 0.00 | 0.15 |
| 129 | 0.00 | 0.15 |
| 130 | 0.00 | 0.15 |
| 131 | 0.00 | 0.16 |
| 132 | 0.00 | 0.16 |
| 133 | 0.00 | 0.16 |
| 134 | 0.00 | 0.16 |
| 135 | 0.00 | 0.16 |
| 136 | 0.00 | 0.17 |
| 137 | 0.00 | 0.18 |
| 138 | 0.00 | 0.19 |
| 139 | 0.00 | 0.20 |
| 140 | 0.00 | 0.20 |
| 141 | 0.00 | 0.20 |
| 142 | 0.00 | 0.21 |
| 143 | 0.00 | 0.21 |
| 144 | 0.00 | 0.21 |
| 145 | 0.00 | 0.22 |
| 146 | 0.00 | 0.22 |
| 147 | 0.00 | 0.23 |
| 148 | 0.00 | 0.23 |
| 149 | 0.00 | 0.24 |
| 150 | 0.00 | 0.24 |
| 151 | 0.00 | 0.24 |
| 152 | 0.00 | 0.25 |
| 153 | 0.00 | 0.25 |
| 154 | 0.00 | 0.25 |
| 155 | 0.01 | 0.27 |
| 156 | 0.02 | 0.29 |
| 157 | 0.02 | 0.31 |
# Total leaf impurity as a function of effective alpha; the last path entry
# is the trivial single-node tree, so it is excluded from the plot.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
ax.set(
    xlabel="effective alpha",
    ylabel="total impurity of leaves",
    title="Total Impurity vs effective alpha for training set",
)
plt.show()
# Fit one tree per candidate alpha along the pruning path
# (DecisionTreeClassifier.fit returns the estimator itself).
clfs = [
    DecisionTreeClassifier(random_state=1, ccp_alpha=alpha).fit(X_train, y_train)
    for alpha in ccp_alphas
]
print(
    "Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
        clfs[-1].tree_.node_count, ccp_alphas[-1]
    )
)
Number of nodes in the last tree is: 1 with ccp_alpha: 0.021601827331998336
# The largest alpha prunes everything down to the root; drop that trivial tree.
clfs, ccp_alphas = clfs[:-1], ccp_alphas[:-1]
node_counts = [c.tree_.node_count for c in clfs]
depth = [c.tree_.max_depth for c in clfs]
# Tree size and depth shrink as alpha grows.
fig, ax = plt.subplots(2, 1, figsize=(10, 7))
ax[0].plot(ccp_alphas, node_counts, marker="o", drawstyle="steps-post")
ax[0].set(xlabel="alpha", ylabel="number of nodes", title="Number of nodes vs alpha")
ax[1].plot(ccp_alphas, depth, marker="o", drawstyle="steps-post")
ax[1].set(xlabel="alpha", ylabel="depth of tree", title="Depth vs alpha")
fig.tight_layout()
# Training and test recall for every pruned tree on the path.
recall_train = [recall_score(y_train, c.predict(X_train)) for c in clfs]
recall_test = [recall_score(y_test, c.predict(X_test)) for c in clfs]
fig, ax = plt.subplots(figsize=(15, 5))
ax.set(
    xlabel="alpha",
    ylabel="Recall",
    title="Recall vs alpha for training and testing sets",
)
ax.plot(ccp_alphas, recall_train, marker="o", label="train", drawstyle="steps-post")
ax.plot(ccp_alphas, recall_test, marker="o", label="test", drawstyle="steps-post")
ax.legend()
plt.show()
# NOTE(review): `best_model` is not defined by any earlier cell -- it is
# only assigned much later (from the random-forest grid search). This cell
# presumably relied on out-of-order notebook execution; confirm which model
# was intended before rerunning top-to-bottom.
decision_tree_postpruned_perf_train = model_performance_classification_sklearn(
    best_model, X_train, y_train
)
decision_tree_postpruned_perf_train
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.00 | 1.00 | 1.00 | 1.00 |
# NOTE(review): `best_model` is only assigned by a much later cell (the
# random-forest grid search), so running the notebook top-to-bottom would
# fail here with a NameError; verify the intended model.
decision_tree_postpruned_perf_test = model_performance_classification_sklearn(
    best_model, X_test, y_test
)
decision_tree_postpruned_perf_test
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.90 | 0.76 | 0.76 | 0.76 |
# Training performance comparison: one column per decision-tree variant
models_train_comp_df = pd.concat(
    [
        decision_tree_perf_train.T,
        decision_tree_tune_perf_train.T,
        decision_tree_postpruned_perf_train.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "Decision Tree sklearn",
    "Decision Tree (Pre-Pruning)",
    "Decision Tree (Post-Pruning)",
]
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
| Decision Tree sklearn | Decision Tree (Pre-Pruning) | Decision Tree (Post-Pruning) | |
|---|---|---|---|
| Accuracy | 1.00 | 1.00 | 1.00 |
| Recall | 1.00 | 1.00 | 1.00 |
| Precision | 1.00 | 1.00 | 1.00 |
| F1 | 1.00 | 1.00 | 1.00 |
# Fitting the model: baseline (untuned) decision tree, re-fit for the
# ensemble-comparison section
d_tree = DecisionTreeClassifier(random_state=1)
d_tree.fit(X_train,y_train)
# Calculating train and test metrics with the shared helper
d_tree_model_train_perf=model_performance_classification_sklearn(d_tree,X_train,y_train)
print("Training performance:\n",d_tree_model_train_perf)
d_tree_model_test_perf=model_performance_classification_sklearn(d_tree,X_test,y_test)
print("Testing performance:\n",d_tree_model_test_perf)
# Creating the confusion matrix on the test split
confusion_matrix_sklearn(d_tree,X_test,y_test)
Training performance:
Accuracy Recall Precision F1
0 1.00 1.00 1.00 1.00
Testing performance:
Accuracy Recall Precision F1
0 0.90 0.76 0.76 0.76
###### Building a Random Forest Model
# Fitting a baseline (untuned) random forest
rf_estimator = RandomForestClassifier(random_state=1)
rf_estimator.fit(X_train,y_train)
# Calculating train and test metrics with the shared helper
rf_estimator_model_train_perf=model_performance_classification_sklearn(rf_estimator,X_train,y_train)
print("Training performance:\n",rf_estimator_model_train_perf)
rf_estimator_model_test_perf=model_performance_classification_sklearn(rf_estimator,X_test,y_test)
print("Testing performance:\n",rf_estimator_model_test_perf)
# Creating the confusion matrix on the test split
confusion_matrix_sklearn(rf_estimator,X_test,y_test)
Training performance:
Accuracy Recall Precision F1
0 1.00 1.00 1.00 1.00
Testing performance:
Accuracy Recall Precision F1
0 0.91 0.57 0.95 0.71
# Tuned random forest: weight the positive (ProdTaken=1) class more heavily
# to counter the ~80/20 class imbalance; keep out-of-bag scoring enabled.
rf_tuned = RandomForestClassifier(
    class_weight={0: 0.18, 1: 0.82},
    random_state=1,
    oob_score=True,
    bootstrap=True,
)
# Grid of parameters to choose from.
parameters = {
    "max_depth": list(np.arange(5, 30, 5)) + [None],
    "max_features": ["sqrt", "log2", None],
    "min_samples_leaf": np.arange(1, 15, 5),
    "min_samples_split": np.arange(2, 20, 5),
}
# Compare parameter combinations by F1 score.
scorer = metrics.make_scorer(metrics.f1_score)
# Exhaustive 5-fold cross-validated search, using all CPU cores.
grid_obj = GridSearchCV(rf_tuned, parameters, scoring=scorer, cv=5, n_jobs=-1)
grid_obj = grid_obj.fit(X_train, y_train)
# Keep the best estimator found by the search.
rf_tuned = grid_obj.best_estimator_
# Fit the best algorithm to the data.
rf_tuned.fit(X_train, y_train)
RandomForestClassifier(class_weight={0: 0.18, 1: 0.82}, max_depth=20,
max_features=None, min_samples_split=7, oob_score=True,
random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomForestClassifier(class_weight={0: 0.18, 1: 0.82}, max_depth=20,
max_features=None, min_samples_split=7, oob_score=True,
random_state=1)#Calculating different metrics
rf_tuned_model_train_perf=model_performance_classification_sklearn(rf_tuned,X_train,y_train)
print("Training performance:\n",rf_tuned_model_train_perf)
rf_tuned_model_test_perf=model_performance_classification_sklearn(rf_tuned,X_test,y_test)
print("Testing performance:\n",rf_tuned_model_test_perf)
#Creating confusion matrix
confusion_matrix_sklearn(rf_tuned,X_test,y_test)
Training performance:
Accuracy Recall Precision F1
0 1.00 0.99 0.98 0.99
Testing performance:
Accuracy Recall Precision F1
0 0.90 0.65 0.83 0.73
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # FIX: 'auto' (an alias of 'sqrt' for classifiers) was deprecated and
    # then removed in scikit-learn 1.3; None means "consider all features".
    'max_features': ['sqrt', None],
    'bootstrap': [True, False],
    'min_weight_fraction_leaf': [0.0, 0.1, 0.2]
}
# Create a random forest classifier (seeded like every other model in this
# notebook so the search is reproducible).
rf = RandomForestClassifier(random_state=1)
# Perform grid search cross-validation (all cores, as in the earlier search)
grid_search = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
# Print the best parameters and best score
print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)
# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
# NOTE(review): despite its name this is the tuned *random forest*, not a
# decision tree; the name is kept because later cells reference it.
dtree_estimator = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print("Test Accuracy: ", test_accuracy)
Best Parameters: {'bootstrap': False, 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100}
Best Score: 0.9155409497880098
Test Accuracy: 0.9289749798224375
DecisionTreeClassifier(class_weight={0: 0.18, 1: 0.72}, random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. DecisionTreeClassifier(class_weight={0: 0.18, 1: 0.72}, random_state=1)#Calculating different metrics
dtree_estimator_model_train_perf=model_performance_classification_sklearn(dtree_estimator,X_train,y_train)
print("Training performance:\n",dtree_estimator_model_train_perf)
dtree_estimator_model_test_perf=model_performance_classification_sklearn(dtree_estimator,X_test,y_test)
print("Testing performance:\n",dtree_estimator_model_test_perf)
#Creating confusion matrix
confusion_matrix_sklearn(dtree_estimator,X_test,y_test)
Training performance:
Accuracy Recall Precision F1
0 1.00 1.00 1.00 1.00
Testing performance:
Accuracy Recall Precision F1
0 0.88 0.70 0.70 0.70
# Choose the type of classifier: bagging ensemble to be tuned.
bagging_estimator_tuned = BaggingClassifier(random_state=1)
# Grid of parameters to choose from (fraction of samples/features per
# base estimator, and the number of base estimators)
parameters = {'max_samples': [0.7,0.8,0.9,1],
'max_features': [0.7,0.8,0.9,1],
'n_estimators' : [10,20,30,40,50],
}
# Type of scoring used to compare parameter combinations (F1)
scorer = metrics.make_scorer(metrics.f1_score)
# Run the grid search with 5-fold cross-validation
grid_obj = GridSearchCV(bagging_estimator_tuned, parameters, scoring=scorer,cv=5)
grid_obj = grid_obj.fit(X_train, y_train)
# Set the clf to the best combination of parameters
bagging_estimator_tuned = grid_obj.best_estimator_
# Fit the best algorithm to the data.
bagging_estimator_tuned.fit(X_train, y_train)
BaggingClassifier(max_features=0.9, max_samples=0.9, n_estimators=50,
random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. BaggingClassifier(max_features=0.9, max_samples=0.9, n_estimators=50,
random_state=1)#Fitting the model
# Baseline (untuned) bagging classifier
bagging_classifier = BaggingClassifier(random_state=1)
bagging_classifier.fit(X_train,y_train)
# Calculating train and test metrics with the shared helper
bagging_classifier_model_train_perf=model_performance_classification_sklearn(bagging_classifier,X_train,y_train)
print(bagging_classifier_model_train_perf)
bagging_classifier_model_test_perf=model_performance_classification_sklearn(bagging_classifier,X_test,y_test)
print(bagging_classifier_model_test_perf)
# Creating the confusion matrix on the test split
confusion_matrix_sklearn(bagging_classifier,X_test,y_test)
Accuracy Recall Precision F1 0 0.99 0.97 1.00 0.98 Accuracy Recall Precision F1 0 0.91 0.59 0.91 0.72
#Calculating different metrics
bagging_estimator_tuned_model_train_perf=model_performance_classification_sklearn(bagging_estimator_tuned,X_train,y_train)
print(bagging_estimator_tuned_model_train_perf)
bagging_estimator_tuned_model_test_perf=model_performance_classification_sklearn(bagging_estimator_tuned,X_test,y_test)
print(bagging_estimator_tuned_model_test_perf)
#Creating confusion matrix
confusion_matrix_sklearn(bagging_estimator_tuned,X_test,y_test)
Accuracy Recall Precision F1 0 1.00 1.00 1.00 1.00 Accuracy Recall Precision F1 0 0.92 0.64 0.95 0.77
# Fitting the model -- baseline (untuned) AdaBoost classifier
ab_classifier = AdaBoostClassifier(random_state=1)
ab_classifier.fit(X_train,y_train)
# Calculating train and test metrics with the shared helper
ab_classifier_model_train_perf=model_performance_classification_sklearn(ab_classifier,X_train,y_train)
print(ab_classifier_model_train_perf)
ab_classifier_model_test_perf=model_performance_classification_sklearn(ab_classifier,X_test,y_test)
print(ab_classifier_model_test_perf)
# Creating the confusion matrix on the test split
confusion_matrix_sklearn(ab_classifier,X_test,y_test)
Accuracy Recall Precision F1 0 0.85 0.36 0.71 0.48 Accuracy Recall Precision F1 0 0.84 0.37 0.70 0.48
# Choose the type of classifier: AdaBoost.
# FIX: this cell was a copy-paste of the bagging grid search, so `abc_tuned`
# (consumed by the very next metrics cell) was never defined, and the
# bagging-specific parameters (max_samples/max_features) do not apply to
# AdaBoost. Tune AdaBoost's own hyperparameters instead.
abc_tuned = AdaBoostClassifier(random_state=1)
# Grid of parameters to choose from
parameters = {
    "n_estimators": [10, 20, 30, 40, 50],
    "learning_rate": [0.1, 0.5, 1.0],
}
# Type of scoring used to compare parameter combinations (F1)
scorer = metrics.make_scorer(metrics.f1_score)
# Run the grid search with 5-fold cross-validation
grid_obj = GridSearchCV(abc_tuned, parameters, scoring=scorer, cv=5)
grid_obj = grid_obj.fit(X_train, y_train)
# Set the clf to the best combination of parameters
abc_tuned = grid_obj.best_estimator_
# Fit the best algorithm to the data.
abc_tuned.fit(X_train, y_train)
BaggingClassifier(max_features=0.9, max_samples=0.9, n_estimators=50,
random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. BaggingClassifier(max_features=0.9, max_samples=0.9, n_estimators=50,
random_state=1)#Calculating different metrics
abc_tuned_model_train_perf=model_performance_classification_sklearn(abc_tuned,X_train,y_train)
print(abc_tuned_model_train_perf)
abc_tuned_model_test_perf=model_performance_classification_sklearn(abc_tuned,X_test,y_test)
print(abc_tuned_model_test_perf)
#Creating confusion matrix
confusion_matrix_sklearn(abc_tuned,X_test,y_test)
Accuracy Recall Precision F1 0 1.00 1.00 1.00 1.00 Accuracy Recall Precision F1 0 0.90 0.64 0.81 0.71
# Fitting the model: baseline (untuned) gradient boosting classifier
gb_classifier = GradientBoostingClassifier(random_state=1)
gb_classifier.fit(X_train,y_train)
# Calculating train and test metrics with the shared helper
gb_classifier_model_train_perf=model_performance_classification_sklearn(gb_classifier,X_train,y_train)
print("Training performance:\n",gb_classifier_model_train_perf)
gb_classifier_model_test_perf=model_performance_classification_sklearn(gb_classifier,X_test,y_test)
print("Testing performance:\n",gb_classifier_model_test_perf)
# Creating the confusion matrix on the test split
confusion_matrix_sklearn(gb_classifier,X_test,y_test)
Training performance:
Accuracy Recall Precision F1
0 0.90 0.51 0.90 0.65
Testing performance:
Accuracy Recall Precision F1
0 0.86 0.39 0.79 0.52
# Choose the type of classifier: gradient boosting whose initial estimator
# (the model supplying the starting predictions) is an AdaBoost classifier.
gbc_tuned = GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),random_state=1)
# Grid of parameters to choose from (boosting rounds, row subsampling,
# and fraction of features per split)
parameters = {
"n_estimators": [100,150,200,250],
"subsample":[0.8,0.9,1],
"max_features":[0.7,0.8,0.9,1]
}
# Type of scoring used to compare parameter combinations (F1)
scorer = metrics.make_scorer(metrics.f1_score)
# Run the grid search with 5-fold cross-validation
grid_obj = GridSearchCV(gbc_tuned, parameters, scoring=scorer,cv=5)
grid_obj = grid_obj.fit(X_train, y_train)
# Set the clf to the best combination of parameters
gbc_tuned = grid_obj.best_estimator_
# Fit the best algorithm to the data.
gbc_tuned.fit(X_train, y_train)
GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),
max_features=0.9, n_estimators=250, random_state=1,
subsample=0.8)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),
max_features=0.9, n_estimators=250, random_state=1,
subsample=0.8)AdaBoostClassifier(random_state=1)
AdaBoostClassifier(random_state=1)
#Calculating different metrics
gbc_tuned_model_train_perf=model_performance_classification_sklearn(gbc_tuned,X_train,y_train)
print("Training performance:\n",gbc_tuned_model_train_perf)
gbc_tuned_model_test_perf=model_performance_classification_sklearn(gbc_tuned,X_test,y_test)
print("Testing performance:\n",gbc_tuned_model_test_perf)
#Creating confusion matrix
confusion_matrix_sklearn(gbc_tuned,X_test,y_test)
Training performance:
Accuracy Recall Precision F1
0 0.93 0.70 0.95 0.80
Testing performance:
Accuracy Recall Precision F1
0 0.88 0.49 0.82 0.61
# Fitting the model: baseline (untuned) XGBoost classifier
xgb_classifier = XGBClassifier(random_state=1, eval_metric='logloss')
xgb_classifier.fit(X_train,y_train)
# Calculating train and test metrics with the shared helper
xgb_classifier_model_train_perf=model_performance_classification_sklearn(xgb_classifier,X_train,y_train)
print("Training performance:\n",xgb_classifier_model_train_perf)
xgb_classifier_model_test_perf=model_performance_classification_sklearn(xgb_classifier,X_test,y_test)
print("Testing performance:\n",xgb_classifier_model_test_perf)
# Creating the confusion matrix on the test split
confusion_matrix_sklearn(xgb_classifier,X_test,y_test)
Training performance:
Accuracy Recall Precision F1
0 1.00 1.00 1.00 1.00
Testing performance:
Accuracy Recall Precision F1
0 0.93 0.69 0.95 0.80
# Choose the type of classifier: XGBoost to be tuned.
xgb_tuned = XGBClassifier(random_state=1, eval_metric='logloss')
# Grid of parameters to choose from. scale_pos_weight > 1 up-weights the
# minority positive class to counter the ~80/20 imbalance.
parameters = {
"n_estimators": [10,30,50],
"scale_pos_weight":[1,2,5],
"subsample":[0.7,0.9,1],
"learning_rate":[0.05, 0.1,0.2],
"colsample_bytree":[0.7,0.9,1],
"colsample_bylevel":[0.5,0.7,1]
}
# Type of scoring used to compare parameter combinations (F1)
scorer = metrics.make_scorer(metrics.f1_score)
# Run the grid search with 5-fold cross-validation
grid_obj = GridSearchCV(xgb_tuned, parameters,scoring=scorer,cv=5)
grid_obj = grid_obj.fit(X_train, y_train)
# Set the clf to the best combination of parameters
xgb_tuned = grid_obj.best_estimator_
# Fit the best algorithm to the data.
xgb_tuned.fit(X_train, y_train)
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=1, colsample_bynode=None, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric='logloss', feature_types=None, gamma=None,
gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.2, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=50, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=1, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=1, colsample_bynode=None, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric='logloss', feature_types=None, gamma=None,
gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.2, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=50, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=1, ...)#Calculating different metrics
xgb_tuned_model_train_perf=model_performance_classification_sklearn(xgb_tuned,X_train,y_train)
print("Training performance:\n",xgb_tuned_model_train_perf)
xgb_tuned_model_test_perf=model_performance_classification_sklearn(xgb_tuned,X_test,y_test)
print("Testing performance:\n",xgb_tuned_model_test_perf)
#Creating confusion matrix
confusion_matrix_sklearn(xgb_tuned,X_test,y_test)
Training performance:
Accuracy Recall Precision F1
0 0.99 1.00 0.95 0.98
Testing performance:
Accuracy Recall Precision F1
0 0.90 0.75 0.75 0.75
# Classifiers needed to assemble the stacked ensemble.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    BaggingClassifier,
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    StackingClassifier,
)
from xgboost import XGBClassifier

# Level-0 learners: the previously tuned tree ensembles.
estimators = [
    ('Random Forest', rf_tuned),
    ('Gradient Boosting', gbc_tuned),
    ('AdaBoost', abc_tuned),
]
# Level-1 (meta) learner: the tuned XGBoost model combines the base predictions.
final_estimator = xgb_tuned

stacking_classifier = StackingClassifier(
    estimators=estimators,
    final_estimator=final_estimator,
)
stacking_classifier.fit(X_train, y_train)
StackingClassifier(estimators=[('Random Forest',
RandomForestClassifier(class_weight={0: 0.18,
1: 0.82},
max_depth=20,
max_features=None,
min_samples_split=7,
oob_score=True,
random_state=1)),
('Gradient Boosting',
GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),
max_features=0.9,
n_estimators=250,
random_state=1,
subsample=0.8)),
('AdaBoost',
AdaB...
gpu_id=None, grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=0.2,
max_bin=None,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None,
max_depth=None,
max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
n_estimators=50, n_jobs=None,
num_parallel_tree=None,
predictor=None, random_state=1, ...))In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. StackingClassifier(estimators=[('Random Forest',
RandomForestClassifier(class_weight={0: 0.18,
1: 0.82},
max_depth=20,
max_features=None,
min_samples_split=7,
oob_score=True,
random_state=1)),
('Gradient Boosting',
GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),
max_features=0.9,
n_estimators=250,
random_state=1,
subsample=0.8)),
('AdaBoost',
AdaB...
gpu_id=None, grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=0.2,
max_bin=None,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None,
max_depth=None,
max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
n_estimators=50, n_jobs=None,
num_parallel_tree=None,
predictor=None, random_state=1, ...))RandomForestClassifier(class_weight={0: 0.18, 1: 0.82}, max_depth=20,
max_features=None, min_samples_split=7, oob_score=True,
random_state=1)AdaBoostClassifier(random_state=1)
AdaBoostClassifier(random_state=1)
DecisionTreeClassifier(max_depth=3)
DecisionTreeClassifier(max_depth=3)
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=1, colsample_bynode=None, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric='logloss', feature_types=None, gamma=None,
gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.2, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=50, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=1, ...)#Calculating different metrics
# Score the stacked ensemble on both splits with the shared metric helper,
# then visualize its test-set errors as a confusion matrix.
stacking_classifier_model_train_perf = model_performance_classification_sklearn(
    stacking_classifier, X_train, y_train
)
print("Training performance:\n", stacking_classifier_model_train_perf)

stacking_classifier_model_test_perf = model_performance_classification_sklearn(
    stacking_classifier, X_test, y_test
)
print("Testing performance:\n", stacking_classifier_model_test_perf)

# Confusion matrix on the held-out test data.
confusion_matrix_sklearn(stacking_classifier, X_test, y_test)
Training performance:
Accuracy Recall Precision F1
0 0.98 1.00 0.91 0.95
Testing performance:
Accuracy Recall Precision F1
0 0.90 0.82 0.73 0.78
# training performance comparison
# Collect each model's training metrics (Accuracy/Recall/Precision/F1 as rows)
# side by side — one column per model, in the same order as the labels below.
models_train_comp_df = pd.concat(
    [
        d_tree_model_train_perf.T,
        dtree_estimator_model_train_perf.T,
        rf_estimator_model_train_perf.T,
        rf_tuned_model_train_perf.T,
        bagging_classifier_model_train_perf.T,
        bagging_estimator_tuned_model_train_perf.T,
        ab_classifier_model_train_perf.T,
        abc_tuned_model_train_perf.T,
        gb_classifier_model_train_perf.T,
        gbc_tuned_model_train_perf.T,
        xgb_classifier_model_train_perf.T,
        xgb_tuned_model_train_perf.T,
        stacking_classifier_model_train_perf.T,
    ],
    axis=1,
)
# Human-readable column labels (must stay in the same order as the frames above).
models_train_comp_df.columns = [
    "Decision Tree",
    "Decision Tree Estimator",
    "Random Forest Estimator",
    "Random Forest Tuned",
    "Bagging Classifier",
    "Bagging Estimator Tuned",
    "Adaboost Classifier",
    "Adaboost Classifier Tuned",  # fixed typo: was "Adabosst Classifier Tuned"
    "Gradient Boost Classifier",
    "Gradient Boost Classifier Tuned",
    "XGBoost Classifier",
    "XGBoost Classifier Tuned",
    "Stacking Classifier",
]
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
| Decision Tree | Decision Tree Estimator | Random Forest Estimator | Random Forest Tuned | Bagging Classifier | Bagging Estimator Tuned | Adaboost Classifier | Adaboost Classifier Tuned | Gradient Boost Classifier | Gradient Boost Classifier Tuned | XGBoost Classifier | XGBoost Classifier Tuned | Stacking Classifier | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Accuracy | 1.00 | 1.00 | 1.00 | 1.00 | 0.99 | 1.00 | 0.85 | 1.00 | 0.90 | 0.93 | 1.00 | 0.99 | 0.98 |
| Recall | 1.00 | 1.00 | 1.00 | 0.99 | 0.97 | 1.00 | 0.36 | 1.00 | 0.51 | 0.70 | 1.00 | 1.00 | 1.00 |
| Precision | 1.00 | 1.00 | 1.00 | 0.98 | 1.00 | 1.00 | 0.71 | 1.00 | 0.90 | 0.95 | 1.00 | 0.95 | 0.91 |
| F1 | 1.00 | 1.00 | 1.00 | 0.99 | 0.98 | 1.00 | 0.48 | 1.00 | 0.65 | 0.80 | 1.00 | 0.98 | 0.95 |
There are some really good models built here. Random Forest, Bagging, and XGBoost all perform well; however, I would lean most towards Random Forest as the prediction model.
# Horizontal bar chart of the tuned random forest's feature importances,
# sorted ascending so the most influential feature appears at the top.
feature_names = X_train.columns
importances = rf_tuned.feature_importances_
indices = np.argsort(importances)
positions = range(len(indices))

plt.figure(figsize=(12, 12))
plt.title('Feature Importances')
plt.barh(positions, importances[indices], color='violet', align='center')
plt.yticks(positions, [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
# Mean Age per ProdTaken group — compares the average age of
# customers who took the product (1) vs. those who did not (0).
df.groupby(['ProdTaken'])['Age'].mean()
ProdTaken 0 37.93 1 34.33 Name: Age, dtype: float64
# Mean DurationOfPitch per ProdTaken group — average sales-pitch
# length for buyers (1) vs. non-buyers (0).
df.groupby(['ProdTaken'])['DurationOfPitch'].mean()
ProdTaken 0 15.22 1 17.00 Name: DurationOfPitch, dtype: float64
# Mean MonthlyIncome per ProdTaken group — average income of
# buyers (1) vs. non-buyers (0).
df.groupby(['ProdTaken'])['MonthlyIncome'].mean()
ProdTaken 0 23378.20 1 21898.13 Name: MonthlyIncome, dtype: float64
# Summarize PitchSatisfactionScore per ProdTaken group: mean, min and max.
# Reuse one GroupBy object instead of rebuilding it for each aggregation.
pitch_by_outcome = df.groupby(['ProdTaken'])['PitchSatisfactionScore']
print(pitch_by_outcome.mean(),
      pitch_by_outcome.min(),
      pitch_by_outcome.max())
ProdTaken 0 3.03 1 3.21 Name: PitchSatisfactionScore, dtype: float64 ProdTaken 0 1 1 1 Name: PitchSatisfactionScore, dtype: int64 ProdTaken 0 5 1 5 Name: PitchSatisfactionScore, dtype: int64